Any time you have issues, your first reference should be the ggplot2 help page at http://docs.ggplot2.org/current/.
Building blocks of a ggplot graph:
# One-time setup: uncomment to install ISLR, the package Carseats is loaded from.
#install.packages('ISLR')

# Attach the packages used below. library() stops with an error when a
# package is missing, whereas require() only warns and returns FALSE, which
# would postpone the failure to the first call that needs the package.
library(ISLR)
library(ggplot2)
library(reshape2)

# Load the Carseats data set and preview its first rows.
data(Carseats)
head(Carseats)
## Sales CompPrice Income Advertising Population Price ShelveLoc Age
## 1 9.50 138 73 11 276 120 Bad 42
## 2 11.22 111 48 16 260 83 Good 65
## 3 10.06 113 35 10 269 80 Medium 59
## 4 7.40 117 100 4 466 97 Medium 55
## 5 4.15 141 64 3 340 128 Bad 38
## 6 10.81 124 113 13 501 72 Bad 78
## Education Urban US
## 1 17 Yes Yes
## 2 10 Yes Yes
## 3 12 Yes Yes
## 4 14 Yes Yes
## 5 13 Yes No
## 6 16 No Yes
# Load the built-in EuStockMarkets time series and flatten it into a
# plain data frame with one column per index.
data(EuStockMarkets)
stock_data <- as.data.frame(EuStockMarkets)

# Keep the observation time as an ordinary numeric column so it can be
# mapped to a plot axis later on.
stock_data[["time"]] <- as.numeric(time(EuStockMarkets))
head(stock_data)
## DAX SMI CAC FTSE time
## 1 1628.75 1678.1 1772.8 2443.6 1991.496
## 2 1613.63 1688.5 1750.5 2460.2 1991.500
## 3 1606.51 1678.6 1718.0 2448.2 1991.504
## 4 1621.04 1684.1 1708.1 2470.4 1991.508
## 5 1618.16 1686.6 1723.1 2484.7 1991.512
## 6 1610.61 1671.6 1714.3 2466.8 1991.515
How do we map variables to features of the plot? Common examples:
# Scatter plot of Sales against Price with ShelveLoc mapped to color and
# Urban mapped to point shape. The plot is stored in plot1, then printed.
plot1 <- ggplot(
  data = Carseats,
  mapping = aes(x = Price, y = Sales, color = ShelveLoc, shape = Urban)
) +
  geom_point()
plot1

# Same axes, colored by Advertising, with explicit axis labels and a title.
ggplot(
  data = Carseats,
  mapping = aes(x = Price, y = Sales, color = Advertising)
) +
  geom_point() +
  labs(x = "Prices ($)", y = "Sales (thousands)", title = "Car Seat Sales")
Note that each layer inherits the aesthetic mappings set in the top-level ggplot() call, but mappings can also be specified for individual layers.
# x and y are mapped once at the top level; color is mapped only inside
# the point layer.
ggplot(Carseats, aes(x = Price, y = Sales)) +
  geom_point(mapping = aes(color = US))
Similarly, data can be supplied to a specific layer; otherwise it is inherited from the top-level ggplot() call.
# Each point layer receives its own subset of Carseats: rows with
# US == 'Yes' are colored by Urban, while rows with US == 'No' are drawn
# without a color mapping.
ggplot(data = Carseats, mapping = aes(x = Price, y = Sales)) +
  geom_point(
    data = subset(Carseats, US == "Yes"),
    mapping = aes(color = Urban)
  ) +
  geom_point(data = subset(Carseats, US == "No"))
What marks do we want on the plot? Each geom_ object tends to have certain aes arguments that it requires or can use. Examples:
- geom_point for scatter plots
- geom_line for line plots
- geom_histogram
- geom_text

When using multiple geometries, you can directly modify their attributes or add them to the aesthetic (so they get included in legends).
# Copy Carseats and append the predictions of a linear regression of
# Sales on Price, evaluated on the same rows the model was fitted to.
Carseats_addpred <- Carseats
Carseats_addpred$Sales_pred <- predict(lm(Sales ~ Price, data = Carseats))

# Color given outside aes(): a fixed attribute of the line layer.
ggplot(Carseats_addpred, aes(x = Price)) +
  geom_point(mapping = aes(y = Sales)) +
  geom_line(mapping = aes(y = Sales_pred), color = "Magenta")

# Color given inside aes(): the constant labels 'Data' and 'Predicted'
# are treated as a mapped variable.
ggplot(Carseats_addpred, aes(x = Price)) +
  geom_point(mapping = aes(y = Sales, color = "Data")) +
  geom_line(mapping = aes(y = Sales_pred, color = "Predicted"))
Some miscellaneous other geom_ examples.
# DAX over the first 100 observations, with a smoother (span = 0.3).
ggplot(head(stock_data, n = 100), aes(x = time, y = DAX)) +
  geom_point() +
  geom_smooth(span = 0.3)

# Sales against Price with an 'lm' smoother and no standard-error band.
ggplot(Carseats, aes(x = Price, y = Sales)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Density of Sales per shelf location; alpha = 0.3 is passed to the
# density layer.
ggplot(Carseats, aes(x = Sales, color = ShelveLoc, fill = ShelveLoc)) +
  geom_density(alpha = 0.3)

# Reference lines: a line with intercept 10 and slope -0.04, plus a dashed
# vertical line at Price = 115.
ggplot(Carseats, aes(x = Price, y = Sales)) +
  geom_point() +
  geom_abline(intercept = 10, slope = -0.04) +
  geom_vline(xintercept = 115, linetype = "dashed")
# Attach mvtnorm for dmvnorm(). library() fails immediately if the package
# is not installed; require() would only warn and let dmvnorm() fail later.
library(mvtnorm)

# Evaluate a bivariate normal density on a 50 x 50 grid of (Price, Sales)
# values, using the sample mean and covariance of the observed pairs.
dat <- Carseats[, c("Price", "Sales")]
xgrid <- expand.grid(
  Price = seq(24, 191, length.out = 50),
  Sales = seq(0, 16.3, length.out = 50)
)
xgrid$p <- dmvnorm(xgrid, mean = colMeans(dat), sigma = cov(dat))

# Overlay the single contour at density 3e-4 on the scatter plot; the
# contour layer reads its own data (xgrid) instead of Carseats.
ggplot(Carseats, aes(x = Price, y = Sales)) +
  geom_point() +
  geom_contour(aes(z = p), data = xgrid, breaks = 3e-4)
# Scatter plot of Sales against Price with an ellipse layer added by
# stat_ellipse() using its default settings.
ggplot(data = Carseats, mapping = aes(x = Price, y = Sales)) +
  geom_point() +
  stat_ellipse()
You’ll very often find that you need data in long format to plot it using ggplot, which means you will want to use melt from the reshape2 package. To change data in the other direction, from long to wide, you can use dcast.
# Wide format: one column per index plus the time column.
head(stock_data)
## DAX SMI CAC FTSE time
## 1 1628.75 1678.1 1772.8 2443.6 1991.496
## 2 1613.63 1688.5 1750.5 2460.2 1991.500
## 3 1606.51 1678.6 1718.0 2448.2 1991.504
## 4 1621.04 1684.1 1708.1 2470.4 1991.508
## 5 1618.16 1686.6 1723.1 2484.7 1991.512
## 6 1610.61 1671.6 1714.3 2466.8 1991.515

# Long format: keep time as the identifier and stack the remaining columns
# into 'variable' (column name) and 'value' (column entry).
stock_data_melted <- melt(stock_data, id.vars = "time")
head(stock_data_melted)
## time variable value
## 1 1991.496 DAX 1628.75
## 2 1991.500 DAX 1613.63
## 3 1991.504 DAX 1606.51
## 4 1991.508 DAX 1621.04
## 5 1991.512 DAX 1618.16
## 6 1991.515 DAX 1610.61

# One line per index, distinguished by color.
ggplot(
  data = stock_data_melted,
  mapping = aes(x = time, y = value, color = variable)
) +
  geom_line()
# Base plot reused by the examples that follow: Sales against Price,
# colored by Advertising, with point shape given by ShelveLoc.
plot2 <- ggplot(
  data = Carseats,
  mapping = aes(x = Price, y = Sales, color = Advertising, shape = ShelveLoc)
) +
  geom_point()

# Rename the shape legend, choose the order of its entries, and relabel them.
plot2 +
  scale_shape_discrete(
    name = "Shelve Location",
    breaks = c("Good", "Medium", "Bad"),
    labels = c("G", "M", "B")
  )

# Rename the color legend, set its breaks, and choose the gradient end points.
plot2 +
  scale_color_continuous(
    name = "Advertising Level",
    breaks = seq(0, 30, 5),
    low = "grey",
    high = "red"
  )
# Attach RColorBrewer with library() so a missing package raises an error
# here, rather than require()'s warning plus a FALSE return value.
library(RColorBrewer)

# Color Advertising with the 'YlOrRd' palette; direction = 1 is passed
# through to scale_color_distiller().
plot2 + scale_color_distiller(palette = "YlOrRd", direction = 1)
# Axis scales: fixed x limits, log10 y axis, reversed y axis.
plot2 + scale_x_continuous(limits = c(0, 300))
plot2 + scale_y_log10()
plot2 + scale_y_reverse()

# Fixed aspect ratio between the two axes.
plot2 + coord_fixed(ratio = 5)

# Facets: columns by Urban, then a US-by-Urban grid labelled with
# label_both.
plot2 + facet_grid(. ~ Urban)
plot2 + facet_grid(US ~ Urban, labeller = label_both)

# One panel per index, wrapped into a grid.
ggplot(stock_data_melted, aes(x = time, y = value)) +
  geom_line() +
  facet_wrap(~variable)

# Complete themes.
plot2 + theme_bw()
plot2 + theme_minimal()

# Legend placed via a numeric position with matching justification, laid
# out horizontally, with a light grey outline.
# NOTE(review): newer ggplot2 releases appear to deprecate a numeric
# legend.position in favour of legend.position.inside; confirm against the
# installed version.
plot2 +
  theme(
    legend.position = c(0.98, 0.98),
    legend.justification = c(1, 1),
    legend.box = "horizontal",
    legend.background = element_rect(color = "lightgrey")
  )
Plot the centers of each US state by their latitude and longitude. Label the states with their two-letter abbreviation. Size the dots according to each state’s population. Color the states according to their regional division. Note that you can load this data using data(state), after which state data will be located in:
- state.center
- state.abb
- as.data.frame(state.x77)$Population
- state.division

dowjones.csv contains weekly percentage changes for a set of 30 stocks. Attempt to replicate the following figure, which visualizes the correlation matrix of the stocks.
Hints:
- Use geom_tile.
- Use melt on the correlation matrix.
- To choose an ordering for the stocks, use hclust(distance_matrix)$order to extract one based on hierarchical clustering, as long as you can get an appropriate distance matrix. You might need to use as.dist to convert a matrix to a “distance matrix” form.